\documentclass[11pt]{article}
\usepackage{graphicx} % more modern
%\usepackage{times}
\usepackage{helvet}
\usepackage{courier}
\usepackage{epsf}
\usepackage{amsmath,amssymb,amsfonts,verbatim}
\usepackage{subfigure}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{latexsym}
\usepackage{algpseudocode}
\usepackage{algorithm}
\usepackage{enumerate}
%\usepackage{algorithmic}
\usepackage{multirow}
\usepackage{xcolor}

% Bold upright single-character shorthands: each \X below typesets a bold "X"
% using the old-style {\bf ...} font switch (e.g. \A -> bold A, \0 -> bold 0).
% NOTE(review): \a, \b, \c, \d, \H, \k, \L, \t, \u and \v are standard LaTeX
% commands (text accents, the letter \L, and \a used inside tabbing).
% Redefining them with \def silently replaces those meanings for the whole
% document, so e.g. \c{c} or \v{s} no longer produce accented letters.
\def\A{{\bf A}}
\def\a{{\bf a}}
\def\B{{\bf B}}
\def\b{{\bf b}}
\def\C{{\bf C}}
\def\c{{\bf c}}
\def\D{{\bf D}}
\def\d{{\bf d}}
\def\E{{\bf E}}
\def\e{{\bf e}}
\def\F{{\bf F}}
\def\f{{\bf f}}
\def\G{{\bf G}}
\def\g{{\bf g}}
\def\k{{\bf k}}
\def\K{{\bf K}}
\def\H{{\bf H}}
\def\I{{\bf I}}
\def\L{{\bf L}}
\def\M{{\bf M}}
\def\m{{\bf m}}
\def\n{{\bf n}}
\def\N{{\bf N}}
% NOTE(review): this bold-P definition of \BP is overwritten by the later
% \def\BP{{\mathbb P}} in this preamble, so it never takes effect.
\def\BP{{\bf P}}
\def\R{{\bf R}}
\def\BS{{\bf S}}
\def\s{{\bf s}}
\def\t{{\bf t}}
\def\T{{\bf T}}
\def\U{{\bf U}}
\def\u{{\bf u}}
\def\V{{\bf V}}
\def\v{{\bf v}}
\def\W{{\bf W}}
\def\w{{\bf w}}
\def\X{{\bf X}}
\def\Y{{\bf Y}}
\def\Q{{\bf Q}}
\def\x{{\bf x}}
\def\y{{\bf y}}
\def\Z{{\bf Z}}
\def\z{{\bf z}}
\def\0{{\bf 0}}
\def\1{{\bf 1}}


% Accented shorthands (math mode only, since \hat and \tilde are math accents):
%   \hx, \tx, \ty, \tz : hat / tilde over a bold x, y or z
%   \hd                : hat over an italic (non-bold) d
%   \HD                : hat over a bold D
\def\hx{\hat{\bf x}}
\def\tx{\tilde{\bf x}}
\def\ty{\tilde{\bf y}}
\def\tz{\tilde{\bf z}}
\def\hd{\hat{d}}
\def\HD{\hat{\bf D}}

% Calligraphic shorthands: \M<letter> -> \mathcal{<letter>}; \SW -> calligraphic "SW".
\def\MA{{\mathcal A}}
\def\ML{{\mathcal L}}
\def\MF{{\mathcal F}}
\def\MR{{\mathcal R}}
\def\MG{{\mathcal G}}
\def\MI{{\mathcal I}}
\def\MN{{\mathcal N}}
\def\MO{{\mathcal O}}
\def\MT{{\mathcal T}}
\def\MX{{\mathcal X}}
\def\SW{{\mathcal {SW}}}
\def\MW{{\mathcal W}}
\def\MY{{\mathcal Y}}
% Blackboard-bold shorthands: \BR, \BP, \BE, \BN -> \mathbb{R}, \mathbb{P}, \mathbb{E}, \mathbb{N}.
% NOTE(review): \BP was already defined earlier in this preamble as bold P;
% because \def does not check for existing names, this later definition
% (blackboard P) is the one in effect in the document body.
\def\BR{{\mathbb R}}
\def\BP{{\mathbb P}}
\def\BE{{\mathbb E}}
\def\BN{{\mathbb N}}

% Bold Greek beta and epsilon, typeset inside an \mbox with \boldmath.
% \epsi omits the trailing \unboldmath that \bet has; the \boldmath switch is
% still confined to the \mbox, so both behave the same way.
\def\bet{\mbox{\boldmath$\beta$\unboldmath}}
\def\epsi{\mbox{\boldmath$\epsilon$}}

% \etal: emphasised "et al." followed by a thin space.
\def\etal{{\em et al.\/}\,}
% Upright operator names for math mode: tr, rk, diag, dg and vec (as \vecd).
% These are plain \mathrm text, so they do not get operator spacing.
\def\tr{\mathrm{tr}}
\def\rk{\mathrm{rk}}
\def\diag{\mathrm{diag}}
\def\dg{\mathrm{dg}}
% argmax / argmin are wrapped in \mathop, so sub/superscripts are placed as
% limits (underneath in display style).
\def\argmax{\mathop{\rm argmax}}
\def\argmin{\mathop{\rm argmin}}
\def\vecd{\mathrm{vec}}

% Bold Greek shorthands. Each macro sets the letter in bold math inside an
% \mbox, so it can be used in text or math mode; because of the \mbox the
% symbol keeps text size and does not shrink in sub/superscripts.
%   lower case: \ph \vp \pii \pss \muu \lam \de \tha
%   upper case: \Ph \Ps \Si \Lam \Gam \Oma \De \Tha
\def\ph{\mbox{\boldmath$\phi$\unboldmath}}
\def\vp{\mbox{\boldmath$\varphi$\unboldmath}}
\def\pii{\mbox{\boldmath$\pi$\unboldmath}}
\def\Ph{\mbox{\boldmath$\Phi$\unboldmath}}
\def\pss{\mbox{\boldmath$\psi$\unboldmath}}
\def\Ps{\mbox{\boldmath$\Psi$\unboldmath}}
\def\muu{\mbox{\boldmath$\mu$\unboldmath}}
\def\Si{\mbox{\boldmath$\Sigma$\unboldmath}}
\def\lam{\mbox{\boldmath$\lambda$\unboldmath}}
\def\Lam{\mbox{\boldmath$\Lambda$\unboldmath}}
\def\Gam{\mbox{\boldmath$\Gamma$\unboldmath}}
\def\Oma{\mbox{\boldmath$\Omega$\unboldmath}}
\def\De{\mbox{\boldmath$\Delta$\unboldmath}}
\def\de{\mbox{\boldmath$\delta$\unboldmath}}
\def\Tha{\mbox{\boldmath$\Theta$\unboldmath}}
\def\tha{\mbox{\boldmath$\theta$\unboldmath}}

% Theorem-like environments. Each one has its OWN counter, reset at every
% \section and printed as <section>.<n>; consequently "Theorem 2.1",
% "Definition 2.1", etc. can all appear in the same section.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{example}{Example}[section]


% \probin: the \vDash symbol rotated by 90 degrees about its centre, boxed so
% it can be used in text or math mode. Needs \rotatebox (graphicx) and \vDash
% (amssymb). NOTE(review): presumably meant as an independence symbol -- it is
% not used in the visible part of the document, so confirm.
\def\probin{\mbox{\rotatebox[origin=c]{90}{$\vDash$}}}

% \calA: calligraphic A via the old \cal switch; same glyph as \MA above.
\def\calA{{\cal A}}



%this is a comment

%use this as a template only... you may not need the subsections,
%or lists however they are placed in the document to show you how
%do it if needed.


%THINGS TO REMEMBER
%to compile a latex document - latex filename.tex
%to view the document        - xdvi filename.dvi
%to create a ps document     - dvips filename.dvi
%to create a pdf document    - dvipdf filename.dvi
%{\bf TEXT}                  - bold font TEXT
%{\it TEXT}                  - italic TEXT
%$ ... $                     - places ... in math mode on same line
%$$ ... $$                   - places ... in math mode on new line
%more info at www.cs.wm.edu/~mliskov/cs423_fall04/tex.html


% Manual page layout: a 6in x 8.5in text block, the same 0.25in side-margin
% offset on odd and even pages, and the top margin pulled up by 0.4in.
\setlength{\oddsidemargin}{.25in}
\setlength{\evensidemargin}{.25in}
\setlength{\textwidth}{6in}
\setlength{\topmargin}{-0.4in}
\setlength{\textheight}{8.5in}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \notes{#1}{#2}{#3}{#4}{#5}: typesets the framed, centred header box at the
% top of the lecture notes and changes the page-number format.
%   #1  prefix for page numbers; pages are printed as "#1 - <page>"
%   #2  text placed at the right end of the first row, opposite the fixed
%       bold course name "Statistical Machine Learning"
%   #3  italic text at the left of the bottom row
%   #4  italic text at the right of the bottom row
%   #5  \Large title centred in the middle row
% The three rows are \hbox'es of fixed width 5.78in stacked in a \vbox inside
% a \framebox; 4mm of vertical space is added below the box.
\newcommand{\notes}[5]{
	\renewcommand{\thepage}{#1 - \arabic{page}}
	\noindent
	\begin{center}
	\framebox{
		\vbox{
		\hbox to 5.78in { { \bf Statistical Machine Learning}
		\hfill #2}
		\vspace{4mm}
		\hbox to 5.78in { {\Large \hfill #5 \hfill} }
		\vspace{2mm}
		\hbox to 5.78in { {\it #3 \hfill #4} }
		}
	}
	\end{center}
	\vspace*{4mm}
}

% \ho{#1}{#2}{#3}{#4}{#5}: prints the lecture header through \notes.
% Only #1 (the lecture number) is used: it becomes the page-number prefix and
% appears in the title "Lecture Notes #1: Multinomial Distribution".
% NOTE(review): #2-#5 are accepted but ignored; the topic ("Distributions"),
% the professor line and the lecture title are hard-coded here, so the date,
% scribe name and title passed at the call site never appear in the output.
\newcommand{\ho}[5]{\notes{#1}{Distributions}{Professor: Zhihua Zhang}{}{Lecture Notes #1: Multinomial Distribution}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%begins a LaTeX document
% Pre-set the counters so that the first \subsection below is numbered 2.5
% (section counter 2, subsection counter 4 + 1).
\setcounter{section}{2}
\setcounter{subsection}{4}
\begin{document}
% Header box for lecture 4. As \ho is defined in the preamble, only the first
% argument is used; the remaining four are ignored.
\ho{4}{2014.03.08}{Moses Liskov}{Name}{Lecture title}
\subsection{More About Mixture Distribution}
\begin{definition}
  In probability theory and statistics, the moment-generating function of a random variable $X$ is 
  \[M_X(t) = \BE[e^{tX}] = \int e^{tx} f_X(x) dx\]
\end{definition}
One property of the moment-generating function is that we can get $\BE[X^k]$ from $M_X^{(k)}(0)$: 
differentiating $k$ times gives $M_X^{(k)}(t) = \int x^k e^{tx} f_X(x) dx$, where we assume that the differentiation can be moved inside the integral. So $M_X^{(k)}(0) = \BE[X^k]$.

\begin{definition}
  A function $f:(0, \infty) \rightarrow \BR$ is a completely monotone function if and only if $f$ is of class $C^\infty$ (infinitely differentiable) and $(-1)^n f^{(n)}(\lambda) \geq 0$ for all $n \in \BN\cup\{0\}$ and all $\lambda > 0$.
\end{definition}

\begin{theorem}(Bernstein)
  Let $g:(0,\infty) \rightarrow \BR$ be a completely monotone function. Then it is the Laplace transform of a unique measure $\mu$ on $[0,\infty)$, i.e. for all $\lambda > 0$, \[g(\lambda) =  \ML(\mu; \lambda) = \int_{[0, \infty)} e^{-\lambda t} \mu(dt).\] Conversely, whenever $\ML(\mu; \lambda) < \infty$ for every $\lambda > 0$,  $\lambda \mapsto  \ML(\mu; \lambda)$ is a completely monotone function.
\end{theorem}

\begin{proof}
  Throughout the proof we write $f$ for $g$, and we assume $f(0+) = 1$ and $f(+\infty) = 0$. By Taylor's formula
  \begin{equation}\begin{split} 
    f(\lambda) &= \sum_{k=0}^{n-1} \frac{f^{(k)}(a)}{k!} (\lambda - a)^k + \int_a^\lambda \frac{f^{(n)}(s)}{(n-1)!}(\lambda - s)^{n-1} ds \\
    &= \sum_{k=0}^{n-1} \frac{(-1)^k f^{(k)}(a)}{k!} (a-\lambda)^k + \int_\lambda^a \frac{(-1)^nf^{(n)}(s)}{(n-1)!} (s-\lambda)^{n-1} ds  
  \end{split}\end{equation}
  
  where $a > 0$ and $n \in \BN$. Let $a \to \infty$, then 
  \[\begin{split}
    \lim_{a\to\infty} \int_\lambda^a \frac{(-1)^n f^{(n)}(s)}{(n-1)!} (s-\lambda)^{n-1} ds &= 
    \int_\lambda^\infty \frac{(-1)^n f^{(n)}(s)}{(n-1)!} (s-\lambda)^{n-1} ds \\
    &\leq f(\lambda).
  \end{split}\]
  So the sum in (1) converges for every $n \in \BN$ as $a \to \infty$. Let 
  \[ \rho_n(\lambda) = \lim_{a \to \infty} \frac{(-1)^n f^{(n)}(a)}{n!} (a-\lambda)^n. \]
  This limit doesn't depend on $\lambda > 0$. Indeed, for $k>0$,
  \[\begin{split}
    \rho_n(k) &= \lim_{a \to \infty} \frac{(-1)^n f^{(n)}(a)}{n!} (a - k)^n  \\
    &= \lim_{a \to \infty} \frac{(-1)^n f^{(n)}(a)}{n!} (a-\lambda)^n \frac{(a -k)^n}{(a-\lambda)^n} \\
    &= \rho_n(\lambda).
  \end{split}\]
  So we can get 
  \[ f(\lambda) = \sum_{k=0}^{n-1} \rho_k(\lambda) + \int_\lambda^\infty \frac{(-1)^n f^{(n)}(s)}{(n-1)!} (s-\lambda)^{n-1} ds\]
  Now let $\lambda \to \infty$. Every term on the right-hand side is nonnegative and $\rho_k$ does not depend on $\lambda$, so $f(+\infty) = 0$ forces $\rho_k = 0$ for every $k$. Then we can get 
  \begin{equation} f(\lambda) = \int_\lambda^\infty \frac{(-1)^nf^{(n)}(s)}{(n-1)!}(s-\lambda)^{n-1} ds. \end{equation}
  And since $f(0+) = 1$, we can get:
  \[1 = \lim_{\lambda \to 0+} f(\lambda) = \int_0^\infty \frac{(-1)^nf^{(n)}(s)}{(n-1)!} s^{n-1} ds\]
  And (2) can also be written as:
  \[ f(\lambda) = \int_0^\infty (1-\frac{\lambda}{s})_+^{n-1} \frac{(-1)^nf^{(n)}(s)}{(n-1)!} s^{n-1}ds. \]
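 Let $t = \frac{n}{s}$, then 
 \[ f(\lambda) = \int_0^\infty (1-\frac{\lambda t}{n})^{n-1}_+ \frac{(-1)^n}{n!}f^{(n)}(\frac{n}{t}) (\frac{n}{t})^{n+1} dt. \]
 By the normalization above, for each $n$ the nonnegative weight $\frac{(-1)^n}{n!}f^{(n)}(\frac{n}{t}) (\frac{n}{t})^{n+1} dt$ is a probability measure on $(0, \infty)$. Since $\lim_{n\to\infty} (1-\frac{\lambda t}{n})^{n-1}_+ = e^{-\lambda t}$, letting $n \to \infty$ gives
 \[ f(\lambda) = \lim_{n\to\infty} \int_0^\infty e^{-\lambda t} \frac{(-1)^n}{n!}f^{(n)}(\frac{n}{t}) (\frac{n}{t})^{n+1} dt,\]
 and the measure $\mu$ is obtained as a weak limit of these probability measures.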
 
 
 For the converse, let $f(\lambda) = \ML(\mu; \lambda) = \int_0^\infty e^{-\lambda t} \mu(dt)$. Differentiating $n$ times under the integral sign gives
 \[\begin{split}(-1)^n f^{(n)}(\lambda) &= \int_0^\infty t^n e^{-\lambda t} \mu(dt) \geq 0,  \end{split}\]
 so $f$ is completely monotone.

\end{proof}

\textbf{Corollary}
Let $g(t)$ be a function that is symmetric about the origin, integrable, convex and twice differentiable on $(0, \infty)$, with $g(0^+) = 1$ and $g(+\infty) = 0$. Then, for $t > 0$,
\[ g(t) = \int_0^{\infty} \frac{1}{s} (1-\frac{t}{s})_+ s^2 g''(s) ds \]

\begin{theorem}
A function $f(x)$ can be represented as a Gaussian scale mixture iff $f(\sqrt{x})$ is completely monotone on $(0, \infty)$.
\end{theorem}

\begin{proof}
Let $g(x) = f(\sqrt{x})$. Then
\[\begin{split}
& f(\sqrt{x}) \text{ is completely monotone}\\ 
\Longleftrightarrow\ & g(x) \text{ is completely monotone}\\
\Longleftrightarrow\ & g(x) = \int_{0}^{\infty}e^{-xt}\mu(\mathrm{d}t) \quad \text{(by Bernstein's theorem)}\\
\Longleftrightarrow\ & f(\sqrt{x}) = \int_{0}^{\infty}e^{-xt}\mu(\mathrm{d}t)\\
\Longleftrightarrow\ & f(x) = \int_{0}^{\infty}e^{-x^2t}\mu(\mathrm{d}t) = \int_{0}^{\infty}\sqrt{\frac{\pi}{t}}\,N(x\mid0, \frac{1}{2t})\mu(\mathrm{d}t),
\text{ and } \int_{0}^{\infty}\mu(\mathrm{d}t) = 1\\
\Longleftrightarrow\ & f(x) \text{ can be represented as a Gaussian scale mixture.}
\end{split}\]
\end{proof}

\begin{theorem}
If $f(x) > 0$, then $e^{-uf(x)}$ is completely monotone for every $u>0$ iff $f'(x)$ is completely monotone.
\end{theorem}


\begin{proof}
If $e^{-uf(x)}$ is completely monotone for every $u>0$:
\[e^{-u f(x)} = \sum_{j=0}^{\infty}\frac{(-1)^ju^j}{j!}[f(x)]^j\] and all of its formal derivatives converge uniformly, so we can calculate $\frac{d^n}{dx^n}e^{-u f(x)}$ by termwise differentiation.
Since $e^{-u f}$ is completely monotone, we have, for every $n \ge 1$:
\[0\le(-1)^n\frac{d^n}{dx^n}e^{-u f(x)} = \sum_{j=1}^{\infty}\frac{u^j}{j!}(-1)^{n+j}\frac{d^n}{dx^n}[f(x)]^j\]
As $u > 0$, dividing by $u$ gives:
\[0\le(-1)^{n+1}\frac{d^n}{dx^n}f(x)+\sum_{j=2}^{\infty}\frac{u^{j-1}}{j!}(-1)^{n+j}\frac{d^n}{dx^n}[f(x)]^j\]
Then let $u \rightarrow 0$:
\[0\le(-1)^{n-1}\frac{d^{n-1}}{dx^{n-1}}f'(x)\]
Hence $f'(x)$ is completely monotone.

If $f'(x)$ is completely monotone, then for every $n \ge 1$:
\[(-1)^{n-1}\frac{d^n}{dx^n}f(x)\ge0\]
Let $g(\lambda) = e^{-\lambda}$ and $\lambda = f(x)$, so that
\[h(x) = e^{- f(x)} = (g\circ f)(x).\]
There is a formula (Fa\`a di Bruno) for the $n$-th derivative of the composition $h = g\circ f$:
\[h^{(n)}(x) = \sum_{(m,i_1,\dots,i_l)}\frac{n!}{i_1!\cdots i_l!}g^{(m)}(f(x))\prod_{j=1}^{l}\left(\frac{f^{(j)}(x)}{j!}\right)^{i_j},\]
where the sum runs over all indices with $\sum_{j=1}^{l}j\cdot i_j = n$ and $\sum_{j=1}^{l}i_j = m$.

We can see that $n = m + \sum_{j=1}^{l}(j-1)\cdot i_j$.

We have $(-1)^mg^{(m)}(f(x)) \ge 0$ and $(-1)^{j-1}f^{(j)}(x) \ge 0$, so every term of the sum has sign $(-1)^{m + \sum_{j}(j-1) i_j} = (-1)^n$.

So $(-1)^nh^{(n)}(x) \ge 0$ which means $e^{-f(x)}$ is completely monotone.

Since $u f'(x)$ is also completely monotone for every $u > 0$, the same argument applied to $u f$ shows that $e^{-u f(x)}$ is completely monotone.


\end{proof}


\section{Multivariable Distribution}
\setcounter{section}{3}
\section{Multinomial Distribution}

\subsection{Bivariate Distribution}
Given a pair of discrete random variables $X$ and $Y$, define the joint probability mass function by $f_{X,Y}(x, y) = \BP(X=x, Y=y) = \BP(X=x \text{ and } Y = y)$.

\begin{definition}
In the continuous case, we call a function $f(x,y)$ a probability density function, if
\begin{enumerate}
\item $f(x,y) \geq 0$ for all $x, y$.
\item $\int_{-\infty}^{+\infty}\int_{-\infty}^{+\infty} f(x,y) dxdy = 1$.
\item for any set $A \subset \BR \times \BR$, $\BP((X,Y) \in A) = \iint\limits_A f(x,y) dxdy$. 
\end{enumerate}
\end{definition}
The joint cumulative distribution function of $(X,Y)$ is given by $F_{X,Y} (x, y) = \BP(X \leq x, Y \leq y)$.

\begin{definition}
If random variables $X$ and $Y$ have joint probability density function $f_{X,Y}(x, y)$, then the marginal density function of $X$ is given by $f_X(x) = \int f_{X, Y}(x, y) dy$.
\end{definition}

\begin{definition}
Random variables $X$ and $Y$ are independent, if for every $A$ and $B$, $\BP(X\in A, Y\in B) = \BP(X\in A)\BP(Y\in B)$.
\end{definition}

\begin{theorem}
Suppose random variables $X$ and $Y$ have joint probability density function $f_{X,Y}$. Then $X$ and $Y$ are independent if and only if $f_{X,Y}(x,y) = f_X(x) f_Y(y)$ for all $x$ and $y$.
\end{theorem}

\begin{definition}
If $f_Y(y) > 0$, then the conditional density function of $X$ given $Y = y$ is $f_{X|Y}(x | y) = \frac{f_{X,Y}(x, y)}{f_Y(y)}$. In the discrete case this is $f_{X|Y}(x | y) = \BP(X = x| Y = y) = \frac{\BP(X=x, Y= y)}{\BP(Y= y)}$.
\end{definition}

\begin{definition}
Let $X = (X_1, X_2, ..., X_n)$ where $X_i$ is a random variable. We call $X$ a random vector, its probability density function is $f_{X_1, ..., X_n}(x_1, x_2, ..., x_n)$, and the marginal is $f(x_i)= \sum_{x_1, ..., x_{i-1}, x_{i+1}, ..., x_n} f(x_1, ... x_n)$ for discrete case. For continuous case, we will use integral instead.
\end{definition}

\begin{definition}
Let $f(x_1, x_2, ..., x_n)$ be the joint density function of $X_1, X_2, ..., X_n$. If $f(x_1, x_2, ..., x_n) = f(x_{\pi_1}, x_{\pi_2}, ..., x_{\pi_n})$ for every permutation $\pi_1, \pi_2, ..., \pi_n$ of $\{1, 2, ..., n\}$, then $X_1, ..., X_n$ are exchangeable.
\end{definition}

\begin{theorem}(de Finetti)
Let $X_i \in \mathcal{X}$ for all $i\in\{1, 2, \dots\}$. Suppose that for any $n$, $X_1, X_2, \dots, X_n$ are exchangeable. Then we have \[f(x_1, x_2, \dots, x_n) = \int \prod_{i=1}^n f(x_i | \theta) f(\theta) d\theta\] for some parameter $\theta$ with prior distribution $f(\theta)$.
\end{theorem}

\begin{theorem}
If $\theta \sim f(\theta)$ and $X_1, X_2, ..., X_n$ are conditionally iid given $\theta$, then marginally $X_1, X_2, ... X_n$ are exchangeable.
\end{theorem}

\subsection{Transformation}
Suppose the random variable $X$ has pdf $f_X$ and cdf $F_X$. Let $Y = g(X)$ be a function of $X$. In the discrete case, the pmf of $Y$ is $f_Y(y) = \BP(Y = y) = \BP(g(X) = y) = \BP(X \in g^{-1}(y))$.

\begin{example}
Suppose $\BP(X=-1) = \BP(X=1) = \frac{1}{4}$ and $\BP(X=0) = \frac{1}{2}$. Let $Y=X^2$. So $\BP(Y=0) = \frac{1}{2}$, $\BP(Y=1)=\frac{1}{2}$.
\end{example}

In the continuous case, the steps to find the density of the transformed variable are:
\begin{enumerate}
\item For each $y$, find the set $A_y = \{x:g(x) \leq y\}$.
\item Find the CDF, $F_Y(y) = \BP(Y \leq y) = \BP(g(X) \leq y) = \BP(X \in \{x: g(x)\leq y\}) = \int_{A_y} f_X(x) dx$.
\item $f_Y(y) = F_Y'(y)$.
\end{enumerate}

\begin{example}
  $f_X(x) = e^{-x}$ for $x>0$, and $Y=g(X)=\log X$. Then $F_X(x) = \int_0^x f_X(u) du = 1 - e^{-x}$. $A_y = \{x: x \leq e^y\}$. $F_Y(y) = \BP(Y \leq y) = \BP(\log X \leq y) = \BP(X \leq e^y) = F_X(e^y) = 1 - e^{-e^y}$. $f_Y(y) = (1 - e^{-e^y})' = e^y e^{-e^y}$.
\end{example}

\begin{example}
  $X \sim \mathrm{Uniform}(-1, 3)$, $Y=X^2$. $f_X(x) = \begin{cases}\frac{1}{4} & x\in(-1,3) \\ 0 & \text{o.w.}\end{cases}$. Now let us think about the density of $Y$. $Y$ can take values in $[0, 9)$. 
  \begin{enumerate}
    \item $0 < y < 1$. $A_y = \{x : x^2 \leq y\} = [-\sqrt{y}, \sqrt{y}]$. $F_Y(y) = \int_{A_y} f_X(x) dx = \frac{1}{2} \sqrt{y}$.
    \item $1 \leq y < 9$. Only the part of $\{x : x^2 \leq y\}$ on which $f_X > 0$ matters, so $A_y = [-1, \sqrt{y}]$. $F_Y(y) = \int_{A_y} \frac{1}{4} dx = \frac{1}{4}(1 + \sqrt{y})$. 
  \end{enumerate}
  So, $f_Y(y) = \begin{cases} \frac{1}{4\sqrt{y}}  & 0 < y < 1 \\ \frac{1}{8\sqrt{y}}  & 1 \leq y < 9\end{cases}$
\end{example}

If random variable $Z = g(X, Y)$, then the way to find density of $Z$ is given by:
\begin{enumerate}
\item For each $z$, find $A_z = \{(x, y): g(x, y)\leq z\}$.
\item Find CDF $F_Z(z) = \BP(Z \leq z) = \iint\limits_{A_z} f_{X,Y}(x, y) dxdy$.
\item $f_Z(z) = F_Z'(z)$.
\end{enumerate}

\begin{example}
  Let $X_1, X_2 \stackrel{iid}{\sim} \mathrm{Uniform}(0, 1)$, $Y = X_1 + X_2$. $f_{X_1, X_2}(x_1, x_2) = \begin{cases} 1 & 0<x_1 < 1, 0 < x_2 < 1 \\ 0 & \text{o.w.} \end{cases}$. With $A_y = \{(x_1, x_2): x_1 + x_2 \leq y\}$, $F_Y(y) = \BP(\{(x_1, x_2): (x_1 + x_2) \leq y \}) = \iint\limits_{A_y} f(x_1, x_2) dx_1dx_2 = \begin{cases}\frac{1}{2}y^2 & 0<y<1 \\
  1 - \frac{(2-y)^2}{2} & 1 \leq y \leq 2 \\ 1 & y > 2 \\ 0 & y \leq 0 \end{cases}$. So,
  $f_Y(y) = \begin{cases}y & 0\leq y \leq 1 \\ 2 - y & 1 < y \leq 2 \\ 0 & \text{o.w.} \end{cases}$
\end{example}

\begin{theorem}
Let $X$ have CDF $F_X(x)$ and $Y=g(X)$, and let $\mathcal{X} = \{x : f_X(x) > 0\}$, $\mathcal{Y} = \{y: y = g(x) \text{ for some }x \in \mathcal{X}\}$.
\begin{enumerate}
\item If $g$ is a strictly increasing function on $\mathcal{X}$, then $F_Y(y) = F_X(g^{-1}(y))$ for $y \in \mathcal{Y}$.
\item If $g$ is a strictly decreasing function on $\mathcal{X}$ and $X$ is a continuous random variable, then $F_Y(y) = 1 - F_X(g^{-1}(y))$ for $y\in\mathcal{Y}$.
\end{enumerate}
\end{theorem}

\begin{theorem}
Let $X$ have continuous pdf $f_X(x)$ and let $Y=g(X)$, where $g$ is a strictly monotone function whose inverse $g^{-1}$ is differentiable. Then $f_Y(y) = f_X(g^{-1}(y)) \left|\frac{d}{dy}g^{-1}(y)\right|$.
\end{theorem}

\begin{proof}
We treat the two cases of the theorem above that expresses $F_Y$ through $F_X$ for strictly increasing and strictly decreasing $g$.
\begin{enumerate}
\item $g$ is a strictly increasing function on $\mathcal{X}$, then 
$f_Y(y) = \frac{dF_Y(y)}{dy} = f_X(g^{-1}(y)) \frac{dg^{-1}(y)}{dy}$.
\item $g$ is a strictly decreasing function on $\mathcal{X}$, then 
$f_Y(y) = \frac{dF_Y(y)}{dy} =-f_X(g^{-1}(y))\frac{dg^{-1}(y)}{dy}$.
\end{enumerate}
Since $\frac{dg^{-1}(y)}{dy}$ is positive in the first case and negative in the second, we can combine them to $f_Y(y) = f_X(g^{-1}(y)) \left|\frac{dg^{-1}(y)}{dy}\right|$.
\end{proof}

\begin{theorem}(Probability integral transformation)
Let $X$ have a continuous cdf $F_X(x)$ and let $Y=F_X(X)$. Then $Y$ has uniform distribution on $(0, 1)$, i.e. $\BP(Y \leq y) = y$ where $0 \leq y \leq 1$.
\end{theorem}

\begin{proof}
Assume for simplicity that $F_X$ is strictly increasing, so that $F_X^{-1}$ exists. Then $\BP(Y \leq y) = \BP(F_X(X) \leq y) = \BP(F_X^{-1}(F_X(X)) \leq F_X^{-1}(y)) = \BP(X \leq F_X^{-1}(y)) = F_X(F_X^{-1}(y)) = y$.
\end{proof}

\end{document}




